{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6. Spam classification\n",
    "\n",
    "## (a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import src.util as util\n",
    "import src.svm as svm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Process a message into a list of lowercase words:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_words(message):\n",
    "    \"\"\"Get the normalized list of words from a message string.\n",
    "\n",
    "    This function should split a message into words, normalize them, and return\n",
    "    the resulting list. For splitting, you should split on spaces. For normalization,\n",
    "    you should convert everything to lowercase.\n",
    "\n",
    "    Args:\n",
    "        message: A string containing an SMS message\n",
    "\n",
    "    Returns:\n",
    "       The list of normalized words from the message.\n",
    "    \"\"\"\n",
    "\n",
    "    # *** START CODE HERE ***\n",
    "    return message.lower().split()\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['test', 'bla', 'bla', '32', '23']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_words(\"teST bla bLa 32 23\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get a dictionary translating common words to integers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_dictionary(messages):\n",
    "    \"\"\"Create a dictionary mapping words to integer indices.\n",
    "\n",
    "    This function should create a dictionary of word to indices using the provided\n",
    "    training messages. Use get_words to process each message. \n",
    "\n",
    "    Rare words are often not useful for modeling. Please only add words to the dictionary\n",
    "    if they occur in at least five messages.\n",
    "\n",
    "    Args:\n",
    "        messages: A list of strings containing SMS messages\n",
    "\n",
    "    Returns:\n",
    "        A python dict mapping words to integers.\n",
    "    \"\"\"\n",
    "\n",
    "    # *** START CODE HERE ***\n",
    "    words = [word for message in messages for word in get_words(message)]\n",
    "    \n",
    "    word_counter = collections.Counter(words)\n",
    "    common_words = [word for word in word_counter if word_counter[word] >= 5]\n",
    "    return {word:i for i,word in enumerate(common_words)}\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'hi': 0, 'hello': 1, 'moin': 2}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messages = [\"hi tom\", \"bye peter\", \"bye\"] + 5*[\"hello\"] + 6*[\"hi\"] + 6*[\"moin\"]\n",
    "dic = create_dictionary(messages)\n",
    "dic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Encode messages into a numpy array:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_text(messages, word_dictionary):\n",
    "    \"\"\"Transform a list of text messages into a numpy array for further processing.\n",
    "\n",
    "    This function should create a numpy array that contains the number of times each word\n",
    "    appears in each message. Each row in the resulting array should correspond to each \n",
    "    message and each column should correspond to a word.\n",
    "\n",
    "    Use the provided word dictionary to map words to column indices. Ignore words that \n",
    "    are not present in the dictionary. Use get_words to get the words for a message.\n",
    "\n",
    "    Args:\n",
    "        messages: A list of strings where each string is an SMS message.\n",
    "        word_dictionary: A python dict mapping words to integers.\n",
    "\n",
    "    Returns:\n",
    "        A numpy array marking the words present in each message.\n",
    "    \"\"\"\n",
    "    # *** START CODE HERE ***\n",
    "    word_counters = [collections.Counter(get_words(message)) for message in messages]\n",
    "    \n",
    "    m = len(messages)\n",
    "    n = len(word_dictionary)\n",
    "    res = np.zeros((m,n), dtype=int)\n",
    "    for i in range(m):\n",
    "        for word,j in word_dictionary.items():\n",
    "            if word in word_counters[i]:\n",
    "                res[i,j] = word_counters[i][word]\n",
    "    \n",
    "    return res\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load training, valuation and test sets:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_messages, train_labels = util.load_spam_dataset('data/ds6_train.tsv')\n",
    "val_messages, val_labels = util.load_spam_dataset('data/ds6_val.tsv')\n",
    "test_messages, test_labels = util.load_spam_dataset('data/ds6_test.tsv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a dictionary from the training set and use it to encode all messages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 1, 1, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [0, 1, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0],\n",
       "       [0, 0, 0, ..., 0, 0, 0]])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dictionary = create_dictionary(train_messages)\n",
    "\n",
    "train_matrix = transform_text(train_messages, dictionary)\n",
    "val_matrix = transform_text(val_messages, dictionary)\n",
    "test_matrix = transform_text(test_messages, dictionary)\n",
    "\n",
    "train_matrix[:100,:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## (b)\n",
    "\n",
    "Messages in the dataset are represented by a vector $x^{(i)} \\in \\mathbb N^n$, where $n$ is the number of words in our dictionary and where $x^{(i)}_k$ is the number of occurences of word $k$ in the $i$-th message.\n",
    "\n",
    "Therefore the maximum likelihood parameters for our model with Laplace smoothing are given by:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$$\\begin{align*} \n",
    "\\phi_{k\\mid y=1} &= \\frac{1+\\sum_{i=1}^m x^{(i)}_k  1\\{y^{(i)}=1\\}}{n+\\sum_{i=1}^m 1\\{y^{(i)}=1\\}\\sum_{j=1}^n x^{(i)}_j} \\\\\n",
    "\\phi_{k\\mid y=0} &= \\frac{1+\\sum_{i=1}^m x^{(i)}_k  1\\{y^{(i)}=0\\}}{n+\\sum_{i=1}^m 1\\{y^{(i)}=0\\}\\sum_{j=1}^n x^{(i)}_j} \\\\\n",
    "\\phi_y &= \\frac 1m \\sum_{i=1}^m1\\{y^{(i)}=1\\}\n",
    "\\end{align*}$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "where $y^{(i)}$ is the label of message $x^{(i)}$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_naive_bayes_model(matrix, labels):\n",
    "    \"\"\"Fit a naive bayes model.\n",
    "\n",
    "    This function should fit a Naive Bayes model given a training matrix and labels.\n",
    "\n",
    "    The function should return the state of that model.\n",
    "\n",
    "    Feel free to use whatever datatype you wish for the state of the model.\n",
    "\n",
    "    Args:\n",
    "        matrix: A numpy array containing word counts for the training data\n",
    "        labels: The binary (0 or 1) labels for that training data\n",
    "\n",
    "    Returns: The trained model\n",
    "    \"\"\"\n",
    "\n",
    "    # *** START CODE HERE ***\n",
    "    x = matrix\n",
    "    y = labels\n",
    "    m,n = x.shape\n",
    "\n",
    "\n",
    "    \n",
    "    phi_1 = (1+ np.sum(x[y==1], axis = 0)) / (n + np.sum(x[y==1]))\n",
    "    phi_0 = (1+ np.sum(x[y==0], axis = 0)) / (n + np.sum(x[y==0]))\n",
    "    phi_y = np.mean(labels)\n",
    "    \n",
    "    return (phi_1, phi_0, phi_y)\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because we assume that for a message $x\\in \\mathbb N^n$ given its label $y\\in \\{0,1\\}$ is multinomially distributed, we get:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$$ p(x\\mid y ) = \\frac {\\left(\\sum_{k=1}^n x_k\\right)!}{\\prod_{k=1}^n x_k!} \\prod_{k=1}^n \\phi_{k\\mid y}^{x_k}.$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because \n",
    "$$ p(y\\mid x) = \\frac{p(x\\mid y)p(y)}{p(x)},$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "we have \n",
    "$$ \\frac{p(y=1\\mid x)}{p(y=0\\mid x)} = \\frac{p(x\\mid y=1)\\phi_y}{p(x\\mid y=0) (1-\\phi_y) }$$"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will classify $x$ as spam if the above fraction is $\\geq 1$, which is equivalent to \n",
    "\n",
    "$$\\begin{align*}\n",
    "\\log \\frac{1-\\phi_y}{\\phi_y} &\\leq \\log \\frac{p(x\\mid y=1)}{p(x\\mid y=0)}\\\\\n",
    "&= \\log \\prod_{k=1}^n\\phi^{x_k}_{k\\mid y=1}\\phi^{-x_k}_{k\\mid y=0}\\\\\n",
    "&= \\sum_{k=1}^n x_k \\left(\\log \\phi_{k\\mid y=1} - \\log \\phi_{k\\mid y=0}\\right).\n",
    "\\end{align*}$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_from_naive_bayes_model(model, matrix):\n",
    "    \"\"\"Use a Naive Bayes model to compute predictions for a target matrix.\n",
    "\n",
    "    This function should be able to predict on the models that fit_naive_bayes_model\n",
    "    outputs.\n",
    "\n",
    "    Args:\n",
    "        model: A trained model from fit_naive_bayes_model\n",
    "        matrix: A numpy array containing word counts\n",
    "\n",
    "    Returns: A numpy array containg the predictions from the model\n",
    "    \"\"\"\n",
    "    # *** START CODE HERE ***\n",
    "    phi_1, phi_0, phi_y = model\n",
    "    x = matrix\n",
    "    m,n = x.shape\n",
    "    \n",
    "    boundary = np.log(1-phi_y) - np.log(phi_y)\n",
    "    \n",
    "    return boundary <= (x @ (np.log(phi_1) - np.log(phi_0)))\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit a model to the training set and compute its predictions on the test set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "naive_bayes_model = fit_naive_bayes_model(train_matrix, train_labels)\n",
    "naive_bayes_predictions = predict_from_naive_bayes_model(naive_bayes_model, test_matrix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compute the model's accuracy:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Naive Bayes had an accuracy of 0.978494623655914 on the testing set\n"
     ]
    }
   ],
   "source": [
    "naive_bayes_accuracy = np.mean(naive_bayes_predictions == test_labels)\n",
    "print('Naive Bayes had an accuracy of {} on the testing set'.format(naive_bayes_accuracy))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## (c)\n",
    "\n",
    "In our notation we have\n",
    "$$ \\begin{align*}\n",
    "\\log \\frac{p(x_j=i\\mid y=1)}{p(x_j=i\\mid y=0} &= \\log \\frac{\\phi_{i\\mid y=1}}{\\phi_{i\\mid y=0}}\n",
    "\\end{align*}$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_top_five_naive_bayes_words(model, dictionary):\n",
    "    \"\"\"Compute the top five words that are most indicative of the spam (i.e positive) class.\n",
    "\n",
    "    Ues the metric given in 6c as a measure of how indicative a word is.\n",
    "    Return the words in sorted form, with the most indicative word first.\n",
    "\n",
    "    Args:\n",
    "        model: The Naive Bayes model returned from fit_naive_bayes_model\n",
    "        dictionary: A mapping of word to integer ids\n",
    "\n",
    "    Returns: The top five most indicative words in sorted order with the most indicative first\n",
    "    \"\"\"\n",
    "    # *** START CODE HERE ***\n",
    "    phi_1, phi_0, _ = model\n",
    "    token_ratings = np.log(phi_1/phi_0)\n",
    "    top_five_tokens = np.argpartition(token_ratings,-5)[-5:]\n",
    "    return [ word for word in dictionary if dictionary[word] in top_five_tokens]\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can find the most indicative words:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The top 5 indicative words for Naive Bayes are:  ['claim', 'tone', 'urgent!', 'won', 'prize']\n"
     ]
    }
   ],
   "source": [
    "top_5_words = get_top_five_naive_bayes_words(naive_bayes_model, dictionary)\n",
    "\n",
    "print('The top 5 indicative words for Naive Bayes are: ', top_5_words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## (d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, radius_to_consider):\n",
    "    \"\"\"Compute the optimal SVM radius using the provided training and evaluation datasets.\n",
    "\n",
    "    You should only consider radius values within the radius_to_consider list.\n",
    "    You should use accuracy as a metric for comparing the different radius values.\n",
    "\n",
    "    Args:\n",
    "        train_matrix: The word counts for the training data\n",
    "        train_labels: The spma or not spam labels for the training data\n",
    "        val_matrix: The word counts for the validation data\n",
    "        val_labels: The spam or not spam labels for the validation data\n",
    "        radius_to_consider: The radius values to consider\n",
    "    \n",
    "    Returns:\n",
    "        The best radius which maximizes SVM accuracy.\n",
    "    \"\"\"\n",
    "    # *** START CODE HERE ***\n",
    "    accuracies = []\n",
    "    \n",
    "    for radius in radius_to_consider:\n",
    "        pred = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix=val_matrix, radius=radius)\n",
    "        accuracies.append(np.mean(pred == val_labels))\n",
    "    \n",
    "    return radius_to_consider[np.argmax(accuracies)]\n",
    "    # *** END CODE HERE ***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the best radius and compute the test accuracy of an RBF-SVM model with that radius:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The optimal SVM radius was 0.1\n",
      "The SVM model had an accuracy of 0.9695340501792115 on the testing set\n"
     ]
    }
   ],
   "source": [
    "    optimal_radius = compute_best_svm_radius(train_matrix, train_labels, val_matrix, val_labels, [0.01, 0.1, 1, 10])\n",
    "    print('The optimal SVM radius was {}'.format(optimal_radius))\n",
    "\n",
    "    svm_predictions = svm.train_and_predict_svm(train_matrix, train_labels, test_matrix, optimal_radius)\n",
    "\n",
    "    svm_accuracy = np.mean(svm_predictions == test_labels)\n",
    "\n",
    "    print('The SVM model had an accuracy of {} on the testing set'.format(svm_accuracy, optimal_radius))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
